1 Introduction

Objectives: The goal of this kernel is to analyze the effects and flavors of three different types of cannabis.

The EDA uses datatable, skim, and plotly; xgboost with histogram is used as the model for this analysis.

2 Basic Set up

2.1 Load Packages

# library(qdap)

2.2 Load Dataset

# Read the cannabis strain table from the kernel's input directory.
weed <- read_csv(file = "input/cannabis.csv")

3 Glimpses

3.1 datatable

# Interactive, filterable table of the raw data (15 rows per page).
# The closing parentheses for `list(` and `datatable(` were missing, which left
# the expression unterminated.
weed %>% 
  datatable(filter = 'top', options = list(
    pageLength = 15, autoWidth = TRUE
  ))


3.2 skim

# Summarise every column with skim() and render the summary with kable().
weed %>% skim() %>% kable()
## Skim summary statistics  
##  n obs: 2351    
##  n variables: 6    
## Variable type: character
## variable      missing   complete   n      min   max    empty   n_unique 
## ------------  --------  ---------  -----  ----  -----  ------  ---------
## Description   33        2318       2351   4     1120   0       2312     
## Effects       0         2351       2351   4     46     0       1655     
## Flavor        46        2305       2351   3     30     0       1293     
## Strain        0         2351       2351   2     30     0       2350     
## Type          0         2351       2351   6     6      0       3        
## Variable type: numeric
## variable   missing   complete   n      mean   sd     p0   p25   p50   p75   p100   hist     
## ---------  --------  ---------  -----  -----  -----  ---  ----  ----  ----  -----  ---------
## Rating     0         2351       2351   4.31   0.84   0    4.2   4.4   4.7   5      ▁▁▁▁▁▁▅▇

3.3 glimpse

# Compact column overview: name, type and the first few values of each column.
glimpse(weed)
## Observations: 2,351
## Variables: 6
## $ Strain      <chr> "100-Og", "98-White-Widow", "1024", "13-Dawgs", "2...
## $ Type        <chr> "hybrid", "hybrid", "sativa", "hybrid", "hybrid", ...
## $ Rating      <dbl> 4.0, 4.7, 4.4, 4.2, 4.6, 0.0, 4.4, 4.2, 4.6, 4.4, ...
## $ Effects     <chr> "Creative,Energetic,Tingly,Euphoric,Relaxed", "Rel...
## $ Flavor      <chr> "Earthy,Sweet,Citrus", "Flowery,Violet,Diesel", "S...
## $ Description <chr> "$100 OG is a 50/50 hybrid strain that packs a str...

4 TreeMaps

4.1 TreeMap by Type

# Number of strains per Type; count() produces the `n` column that the
# treemap maps to tile size and colour. The counting step was missing, which
# left a dangling pipe feeding hchart() into the assignment of its own input.
by_type <- weed %>% 
  count(Type)

hchart(by_type, type = 'treemap', hcaes(x = 'Type', value = 'n', color = 'n'))

4.2 TreeMap by Effects

# One row per (strain, effect): split the comma-separated Effects string and
# expand the resulting list-column. Without the unnest step the pipe was left
# dangling and `Effects` stayed a list-column, which count() below and the
# later tolower()/group_by() uses cannot handle as plain strings.
weed_effects <- weed %>% 
  mutate(Effects = str_split(Effects, ',')) %>% 
  unnest(Effects)

# Treemap of how often each individual effect is reported.
weed_effects %>% 
  count(Effects) %>% 
  hchart(type = 'treemap', hcaes(x = 'Effects', value = 'n', color = 'n'))

4.3 TreeMap by Flavors

# One row per (strain, flavor): drop rows whose Flavor is the literal 'none'
# (filter() also drops rows where Flavor is NA), split the comma-separated
# string and expand the resulting list-column. Without the unnest step the
# pipe was left dangling and `Flavor` stayed a list-column.
weed_flavor <- weed %>% 
  filter(Flavor != 'none') %>% 
  mutate(Flavor = str_split(Flavor, ',')) %>% 
  unnest(Flavor)

# Treemap of how often each individual flavor is reported.
weed_flavor %>% 
  count(Flavor) %>% 
  hchart(type = 'treemap', hcaes(x = 'Flavor', value = 'n', color = 'n'))

5 Drill Down Graph for 3 Types by Effects and by Flavors

# Drill-down bar chart: strain counts per Type, expanding into per-Effect counts.

# Top level: one bar per Type; `drilldown` carries the id of the second-level
# series that opens when the bar is clicked.
df1 <- weed %>% 
  group_by(name = Type, drilldown = Type) %>% 
  summarise(y = n()) %>% 
  # NOTE(review): the last step of this pipeline was missing from the source;
  # sorting by count is a reconstruction -- confirm against the original.
  arrange(desc(y))

# Second level: one series per Type (matched through `id`), each holding the
# per-Effect counts as a parsed list stored in the `data` list-column.
df2 <- weed_effects %>% 
  group_by(Type, Effects) %>% 
  mutate(y = n(), colorByPoint = 1) %>% 
  arrange(desc(y)) %>%
  group_by(name = Type, id = Type, colorByPoint) %>% 
  do(data = list_parse(
    mutate(., name = Effects, drilldown = tolower(paste(Type, Effects, sep = ": "))) %>% 
      group_by(name, drilldown) %>% 
      summarise(y = n()) %>% 
      select(name, y, drilldown) %>%
      # NOTE(review): reconstructed final step, see the note on df1.
      arrange(desc(y))
  ))

# The outer parentheses both assign the chart to `a` and print it.
(a <- highchart() %>% 
  hc_chart(type = 'bar') %>% 
  hc_xAxis(type = "category") %>% 
  hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>% 
  hc_drilldown(
    allowPointDrilldown = TRUE,
    series = list_parse(df2)
  ) %>%
  hc_legend(enabled = FALSE) %>% 
  hc_title(text = "Type of Cannbis vs Effects"))
rm(df1, df2)

# Drill-down bar chart: strain counts per Type, expanding into per-Flavor counts.

# Top level: one bar per Type; `drilldown` carries the id of the second-level
# series that opens when the bar is clicked.
df1 <- weed %>% 
  group_by(name = Type, drilldown = Type) %>% 
  summarise(y = n()) %>% 
  # NOTE(review): the last step of this pipeline was missing from the source;
  # sorting by count is a reconstruction -- confirm against the original.
  arrange(desc(y))

# Second level: one series per Type (matched through `id`), each holding the
# per-Flavor counts as a parsed list stored in the `data` list-column.
df2 <- weed_flavor %>% 
  group_by(Type, Flavor) %>% 
  mutate(y = n(), colorByPoint = 1) %>% 
  arrange(desc(y)) %>%
  group_by(name = Type, id = Type, colorByPoint) %>% 
  do(data = list_parse(
    mutate(., name = Flavor, drilldown = tolower(paste(Type, Flavor, sep = ": "))) %>% 
      group_by(name, drilldown) %>% 
      summarise(y = n()) %>% 
      select(name, y, drilldown) %>%
      # NOTE(review): reconstructed final step, see the note on df1.
      arrange(desc(y))
  ))

# Stored in `b` without printing.
b <- highchart() %>% 
  hc_chart(type = 'bar') %>% 
  hc_xAxis(type = "category") %>% 
  hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>% 
  hc_drilldown(
    allowPointDrilldown = TRUE,
    series = list_parse(df2)
  ) %>%
  hc_legend(enabled = FALSE) %>% 
  hc_title(text = "Type of Cannbis vs Flavor")
rm(df1, df2)

# Lay the two drill-down charts out in one grid. The list() call was left
# open and empty; `a` and `b` are the chart objects removed right below, so
# they are the intended contents.
lst <- list(a, b)

hw_grid(lst, rowheight = 400)
rm(a, b, lst)

6 NLP Setup

6.1 clean corpus

# clean corpus
#' Normalise a tm corpus before building a term-document matrix.
#'
#' Applies, in order: punctuation removal, whitespace stripping, lower-casing,
#' stop-word removal and number removal.
#'
#' @param corpus A corpus object accepted by `tm_map()`.
#' @return The transformed corpus.
cleanCorpus <- function(corpus){
  corpus.tmp <- tm_map(corpus, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  # Punctuation is already gone at this point, so contractions appear without
  # their apostrophe; the extra entries cover those forms.
  # NOTE(review): the source line was cut off after "im", so the original
  # list probably contained more words -- confirm against the original.
  v_stopwords <- c(stopwords("en"), c("thats","weve","hes","theres","ive","im"))
  corpus.tmp <- tm_map(corpus.tmp, removeWords, v_stopwords)
  corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
  # Callers pass the result on to TermDocumentMatrix(), so the cleaned corpus
  # must be the function's value.
  corpus.tmp
}

6.2 frequent terms

# frequent terms 
#' Word frequencies of a character vector, most frequent first.
#'
#' Builds a corpus from `text`, cleans it with `cleanCorpus()`, drops terms
#' above the 0.999 sparsity threshold and sums each term's count over all
#' documents.
#'
#' @param text A vector of documents accepted by `VectorSource()`.
#' @return A data frame with columns `word` and `freq`, sorted by decreasing
#'   `freq`.
frequentTerms <- function(text){
  s.cor <- Corpus(VectorSource(text))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing=TRUE)
  dm <- data.frame(word=names(word_freqs), freq=word_freqs)
  # The function body ended without a value or closing brace; callers pipe the
  # result into head()/mutate(word = ...), so the data frame is returned.
  dm
}

6.3 clean by each type

# clean by each Type
#' Build a term-by-type count matrix from the strain descriptions.
#'
#' Concatenates the descriptions of each of the (up to) three most common
#' `Type` values into one document per type, cleans the documents with
#' `cleanCorpus()` and counts terms per document.
#'
#' @param dataset A data frame with `Type` and `Description` columns.
#' @return The first rows (`head()`) of the term-by-type matrix. The full
#'   matrix is additionally stored as `all_clean` in the global environment.
clean_top_char <- function(dataset){
  # The ranking does not depend on the loop index, so compute it once.
  top <- dataset %>% count(Type) %>% arrange(desc(n)) %>% head(20)
  # Guard against data with fewer than three types instead of indexing past
  # the end of `top$Type`.
  namelist <- top$Type[seq_len(min(3L, nrow(top)))]
  all_dialogue <- vapply(namelist, function(name) {
    descriptions <- dataset$Description[dataset$Type == name]
    # paste() would turn a missing description into the literal word "NA",
    # which would then be counted as a term.
    paste(descriptions[!is.na(descriptions)], collapse = " ")
  }, character(1), USE.NAMES = FALSE)
  all_clean <- all_dialogue %>% 
    VectorSource() %>% 
    Corpus() %>% 
    cleanCorpus() %>% 
    TermDocumentMatrix() %>%
    as.matrix()
  colnames(all_clean) <- namelist
  # NOTE(review): later sections read `all_clean` at top level although no
  # top-level assignment is visible, so the matrix is published to the global
  # environment here -- confirm this matches the original.
  assign("all_clean", all_clean, envir = .GlobalEnv)
  all_clean %>% head()
}

# Build the term-by-type matrix for the full dataset and show its first rows.
clean_top_char(weed)
##              Docs
## Terms         hybrid indica sativa
##   abandon          1      0      0
##   abate            8      5      5
##   abates           1      1      0
##   abating          3      1      0
##   abbreviated      1      0      0
##   abduct           1      0      0

7 Top 30 Words in Description

# Bar chart of the 30 most frequent description terms; reorder() on the
# negated frequency puts the most frequent word first on the x axis.
frequentTerms(weed$Description) %>%
  head(n = 30) %>%
  mutate(word = factor(word)) %>%
  plot_ly(
    x = ~reorder(word, -freq),
    y = ~freq,
    colors = viridis(10)
  ) %>%
  add_bars(color = ~word) %>%
  layout(
    title = "Top 30 Words",
    yaxis = list(title = " "),
    xaxis = list(title = ""),
    margin = list(l = 100)
  )

8 WordCloud

8.1 Commonality Cloud

# Word cloud built from the sativa and indica columns of the term matrix.
commonality.cloud(
  all_clean[, c("sativa", "indica")],
  colors = "steelblue1",
  at.least = 2,
  max.words = 100
)

8.2 Comparison Cloud

# Comparison word cloud for the sativa and indica columns, one colour per type.
comparison.cloud(
  all_clean[, c("sativa", "indica")],
  colors = c("#F8766D", "#00BFC4"),
  max.words = 50
)

9 Pyramid Plot

# Terms that occur in both sativa and indica descriptions, with the absolute
# gap between their counts.
common_words <- all_clean %>%
  as.data.frame() %>% 
  rownames_to_column() %>% 
  filter(sativa>0, indica>0) %>% 
  # select(sativa, indica)
  mutate(difference = abs(sativa - indica)) %>% 
  # NOTE(review): the last step of this pipeline was missing from the source;
  # ordering by the largest gap is a reconstruction -- confirm.
  arrange(desc(difference))

# NOTE(review): the right-hand side was cut off after the pipe; taking the
# first 25 rows is inferred from the variable name -- confirm.
common_words_25 <- common_words %>%
  head(25)

pyramid.plot(common_words_25$sativa, common_words_25$indica,
             labels = common_words_25$rowname, gap = 200,
             top.labels = c("sativa", "Words", "indica"),
             main = "Words in Common", laxlab = NULL, 
             raxlab = NULL, unit = NULL)

## [1] 5.1 4.1 4.1 2.1
rm(common_words, common_words_25)

10 3D Plotly

# Lower-cased vocabulary of distinct effect names, used to pick the matching
# rows out of the term matrix.
effects <- weed_effects$Effects %>% unique() %>% tolower()

# Keep only the description terms that are effect names. The pipeline ended
# in a dangling pipe that would have swallowed the next statement, so it is
# terminated after the filter.
effectByType <- all_clean %>%
  as.data.frame() %>% 
  rownames_to_column('word') %>% 
  filter(word %in% effects)

# 3D scatter: one marker per effect word, positioned by its count in the
# hybrid (x), sativa (y) and indica (z) descriptions.
effectByType %>% 
  plot_ly(x=~hybrid,y=~sativa,z= ~indica, color=~word, hoverinfo = 'text', colors = viridis(15),
          text = ~paste('Effects:', word,
                        '<br>hybrid:', hybrid,
                        '<br>sativa:', sativa,
                        '<br>indica:', indica)) %>% 
  add_markers(opacity = 0.8) %>%
  layout(title = "Effects by Different Cannabis",
         annotations=list(yref='paper',xref="paper",y=1.05,x=1.1, text="Effects",showarrow=F),
         scene = list(xaxis = list(title = 'hybrid'),
                      yaxis = list(title = 'sativa'),
                      zaxis = list(title = 'indica')))

11 Conclusion

